# Deactivate scientific notation
options(scipen = 999, max.print = 10000)

# Seed
seed = 2828
# Load libraries
pacman::p_load(
  kableExtra,
  an9elproject, 
  tidyverse,
  lubridate, 
  magrittr,
  survival,
  survminer, 
  car,
  StepReg,
  plotly,
  install = FALSE, update = FALSE
  )

Load data

# Load DB
# oncoth1 = get_project("oncothr1", version = "0.0.8004")
load("/mnt/ir-bioinf02/home/blobato/oncothromb01/oncothr1_versions/oncothr1.RData_0.0.8004")
oncoth1 = obj
# Get data slot
oncoth1_data = oncoth1$data
# Group tobacco use into 'Never smoked' vs 'Former/current'
oncoth1_data %<>%
  mutate(tobacco_use_imp_grouped = case_when(
    tobacco_use_imp == "Never has smoked" ~ "Never", 
    tobacco_use_imp == "Former smoker" ~ "Former/current",
    tobacco_use_imp == "Active smoker" ~ "Former/current"), 
    .after = tobacco_use_imp)

# Create quartiles for age 
oncoth1_data %<>%
  mutate(age_quartiles = case_when(
    age_when_cancer_dx <= 60 ~ "33-60", 
    age_when_cancer_dx > 60 & age_when_cancer_dx <= 69 ~ "61-69",
    age_when_cancer_dx >= 70 ~ "70-93"
  ), .after = age_when_cancer_dx) %>%
  mutate(age_quartiles = as.factor(age_quartiles))

Create dataset for survival analysis

This dataset must be in the proper format to deal with time-dependent covariates and left- and right-censoring. The function survival::tmerge() helps us with it.

# Get subset of covariates for survival analysis
covariates_df = oncoth1_data %>%
  select(id, 
         patient_group,
         age_when_cancer_dx,
         age_quartiles,
         age_when_cancer_dx_50,
         age_when_cancer_dx_65,
         gender, 
         ethnicity,
         ethnicity_grouped,
         menopausal_status_imp,
         weight, 
         height, 
         body_surface_area,
         bmi_value,
         bmi_category, 
         performance_status_category_corrected_imp,
         hormone_therapy,
         major_surgery_in_medical_history, 
         pregnancy_imp, 
         oral_contraceptive_tx, 
         diabetes_mellitus, 
         dyslipidemia, 
         tobacco_use_imp, 
         tobacco_use_imp_grouped,
         copd_imp, 
         arterial_hypertension, 
         chronic_cardiac_insufficiency, 
         renal_insufficiency, 
         khorana_risk_score_imp,
         krs_category, 
         two_groups_krs, 
         tic_onco_imp,
         previous_onco_surgery, 
         primary_tumor_simplified, 
         tumor_surgically_removed, 
         progression_according_to_clinical_stage_imp, 
         tnm_stage, 
         tnm_stage_detailed, 
         tnm_stage_grouped, 
         tnm_stage_two_groups,
         t_stage_imp, 
         t_stage_grouped,
         n_stage_imp, 
         n_stage_grouped,
         m_stage, 
         m_stage_grouped,
         histology_type_imp, 
         mucinous_histology_imp, 
         grade_histological_differentiation_imp, 
         n_metastases,
         n_metastases_grouped,
         metastasis_adrenal_glands, 
         metastasis_bones,                             
         metastasis_brain,                               
         metastasis_liver,                               
         metastasis_lung,                                
         metastasis_lymph_nodes,                         
         metastasis_peritoneum,                          
         metastasis_pleura,                              
         metastasis_skin,                                
         metastasis_soft_tissues,                        
         other_metastases,
         catheter_device_imp,
         venous_insufficiency_imp, 
         n_comorbidities,
         family_background_vte,
         family_background_vte_n,
         family_background_vte_all_relatives,
         family_background_VTE_father,    
         family_background_VTE_mother,  
         family_background_VTE_siblings, 
         family_background_VTE_offspring, 
         family_background_VTE_other_relatives, 
         previous_vte, 
         previous_ate,
         n_previous_ate,
         previous_ate_type,
         cva_ate,
         myocardial_infarction_ate, 
         pad_ate, 
         tia_ate, 
         other_thromb_ate, 
         anticoag_tx_vte,                                
         anticoag_tx_vte_drugs,                          
         anticoag_tx_cardiopathy,
         anticoag_tx_other_causes, 
         anticoag_tx_reason, 
         anticoag_tx_apixaban,                           
         anticoag_tx_dabigatran,                         
         anticoag_tx_lmwh,                               
         anticoag_tx_lmwh_dosage,                        
         anticoag_tx_vit_k_antag,                        
         anticoag_tx_other_drugs,                        
         anticoag_tx_currently,
         n_antiaggreg_tx, 
         antiaggreg_tx_reason,
         antiaggreg_tx_cardiopathy,                      
         antiaggreg_tx_cva,                              
         antiaggreg_tx_other_causes,
         antiaggreg_tx_currently, 
         tu_1st_blood_transfusion_from_cancer_diagnosis_days, 
         tu_2nd_blood_transfusion_from_cancer_diagnosis_days, 
         tu_3rd_blood_transfusion_from_cancer_diagnosis_days, 
         tu_1st_major_trauma_from_cancer_diagnosis_days, 
         tu_1st_immobilisation_from_cancer_diagnosis_days, 
         tu_2nd_immobilisation_from_cancer_diagnosis_days, 
         tu_3rd_immobilisation_from_cancer_diagnosis_days, 
         tu_1st_recent_major_surgery_from_cancer_diagnosis_days, 
         tu_2nd_recent_major_surgery_from_cancer_diagnosis_days, 
         tu_3rd_recent_major_surgery_from_cancer_diagnosis_days, 
         tu_1st_thromboprophylaxis_hospital_from_cancer_diagnosis_days, 
         tu_1st_erythropoietin_tx_from_cancer_diagnosis_days, 
         tu_2nd_erythropoietin_tx_from_cancer_diagnosis_days, 
         tu_3rd_erythropoietin_tx_from_cancer_diagnosis_days, 
         vte_recurrence)
# Get subset of variables containing information about patient status and time until events
survival_data = oncoth1_data %>%
  select(id, 
         patient_group, 
         date_cancer_dx, 
         date_study_begins, 
         date_study_ends,
         date_death,
         patient_status_at_end_study, 
         censored_patient, 
         follow_up_length_from_cancer_diagnosis_days, 
         follow_up_length_from_cancer_diagnosis_months,
         tu_1st_vte_all_from_cancer_diagnosis_days_corrected,
         tu_1st_vte_all_from_cancer_diagnosis_months_corrected, 
         tu_2nd_vte_all_from_cancer_diagnosis_days_corrected, 
         tu_2nd_vte_all_from_cancer_diagnosis_months_corrected, 
         tu_3rd_vte_all_from_cancer_diagnosis_days_corrected, 
         tu_3rd_vte_all_from_cancer_diagnosis_months_corrected, 
         tu_4th_vte_all_from_cancer_diagnosis_days_corrected, 
         tu_4th_vte_all_from_cancer_diagnosis_months_corrected
         )
# Create object tmerge for managing time-dependant covariates 
# Add time until death/censoring and time until VTE
# For interval censored data, the status indicator is 
# 0=right censored
# 1=event at time 
# 2=left censored
# 3=interval censored
survival_vte_tmerge = tmerge(
  data1 = covariates_df, 
  data2 = survival_data, 
  id = id, 
  tstop = follow_up_length_from_cancer_diagnosis_days, 
  # event() is the survival analysis event of interest (in our case, death)
  death = event(follow_up_length_from_cancer_diagnosis_days, censored_patient), 
  # tdc() are for time-dependent covariates
  vte_event = tdc(tu_1st_vte_all_from_cancer_diagnosis_days_corrected),
  vte_event = tdc(tu_2nd_vte_all_from_cancer_diagnosis_days_corrected),
  vte_event = tdc(tu_3rd_vte_all_from_cancer_diagnosis_days_corrected),
  vte_event = tdc(tu_4th_vte_all_from_cancer_diagnosis_days_corrected)
  )
# Add other time-dependent covariates (i.e., blood transfusion, immobilisation, etc.)
survival_vte_tmerge = tmerge(
  survival_vte_tmerge, 
  survival_vte_tmerge, 
  id = id, 
  blood_transfusion = tdc(tu_1st_blood_transfusion_from_cancer_diagnosis_days),
  blood_transfusion = tdc(tu_2nd_blood_transfusion_from_cancer_diagnosis_days),
  blood_transfusion = tdc(tu_3rd_blood_transfusion_from_cancer_diagnosis_days),
  immobilisation = tdc(tu_1st_immobilisation_from_cancer_diagnosis_days), 
  immobilisation = tdc(tu_2nd_immobilisation_from_cancer_diagnosis_days), 
  immobilisation = tdc(tu_3rd_immobilisation_from_cancer_diagnosis_days), 
  recent_major_surgery = tdc(tu_1st_recent_major_surgery_from_cancer_diagnosis_days),
  recent_major_surgery = tdc(tu_2nd_recent_major_surgery_from_cancer_diagnosis_days),
  recent_major_surgery = tdc(tu_3rd_recent_major_surgery_from_cancer_diagnosis_days),
  thromboprophylaxis_hospital = tdc(tu_1st_thromboprophylaxis_hospital_from_cancer_diagnosis_days), 
  erythropoietin_tx = tdc(tu_1st_erythropoietin_tx_from_cancer_diagnosis_days),
  erythropoietin_tx = tdc(tu_2nd_erythropoietin_tx_from_cancer_diagnosis_days),
  erythropoietin_tx = tdc(tu_3rd_erythropoietin_tx_from_cancer_diagnosis_days)
  )

Survival curves

We will fit several survival curves using the Kaplan-Meier estimator.

VTE

# Fit Kaplan-Meier estimator 
km_fit_vte = survfit(
  Surv(time = tstart, time2 = tstop, event = death) ~ 
    vte_event + 
    cluster(id), # ensures that the survival estimates account for the correlation within individuals, providing more robust results in the presence of repeated measurements
  data = survival_vte_tmerge)

km_fit_vte
## Call: survfit(formula = Surv(time = tstart, time2 = tstop, event = death) ~ 
##     vte_event + cluster(id), data = survival_vte_tmerge)
## 
##             records n.max n.start events median 0.95LCL 0.95UCL
## vte_event=0     650   369     369     97     NA      NA      NA
## vte_event=1     190    50      21     52    274     177     391

This output contains several items:

  • records: The total number of records in each group (vte_event=0 and vte_event=1).
  • n.max: The maximum number of subjects at risk at any time during the observation period.
  • n.start: The number of subjects at the start of the observation period.
  • events: The number of events (deaths) observed.
  • median: The median survival time (days).
  • 0.95LCL and 0.95UCL: The lower and upper 95% confidence limits for the median survival time (days).
km_summary = function(km_survfit) {
  
  # Obtain summary from KM estimator
  summary_fit = summary(km_survfit)

  formatted_summary = data.frame(
    time = summary_fit$time,
    n_risk = summary_fit$n.risk,
    n_event = summary_fit$n.event,
    survival = summary_fit$surv,
    std_error = summary_fit$std.err,
    lower_95CI = summary_fit$lower,
    upper_95CI = summary_fit$upper
  )
  
  # Show table with summary
  formatted_summary %>%
    kbl(caption = "Survival Analysis Summary") %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                  full_width = FALSE) %>%
    column_spec(ncol(formatted_summary))
}
# Show table with summary with all time points from Kaplan-Meier estimation
km_summary(km_fit_vte)
Survival Analysis Summary
time n_risk n_event survival std_error lower_95CI upper_95CI
26.0 358 1 0.9972067 0.0027894 0.9917546 1.0000000
27.0 357 1 0.9944134 0.0039393 0.9867225 1.0000000
40.0 355 1 0.9916122 0.0048223 0.9822055 1.0000000
47.0 353 1 0.9888031 0.0055671 0.9779519 0.9997748
50.0 351 1 0.9859860 0.0062233 0.9738638 0.9982592
58.0 350 1 0.9831689 0.0068133 0.9699053 0.9966139
68.0 349 2 0.9775347 0.0078532 0.9622634 0.9930485
84.0 345 1 0.9747013 0.0083259 0.9585187 0.9911571
87.0 344 1 0.9718679 0.0087706 0.9548290 0.9892108
99.0 338 1 0.9689925 0.0092039 0.9511201 0.9872008
101.0 336 1 0.9661086 0.0096177 0.9474410 0.9851441
115.0 326 1 0.9631451 0.0100344 0.9436775 0.9830143
124.0 318 1 0.9601163 0.0104499 0.9398517 0.9808179
127.0 317 1 0.9570876 0.0108470 0.9360622 0.9785852
137.0 314 1 0.9540395 0.0112326 0.9322761 0.9763109
141.0 312 1 0.9509817 0.0116053 0.9285056 0.9740019
143.0 311 2 0.9448661 0.0123100 0.9210443 0.9693040
149.0 306 1 0.9417783 0.0126512 0.9173060 0.9669034
152.0 304 1 0.9386803 0.0129833 0.9135753 0.9644752
153.0 303 1 0.9355824 0.0133049 0.9098653 0.9620263
154.0 302 1 0.9324844 0.0136168 0.9061743 0.9595583
160.0 298 1 0.9293553 0.0139260 0.9024578 0.9570544
162.0 297 2 0.9230970 0.0145183 0.8950758 0.9519954
175.0 295 1 0.9199678 0.0148024 0.8914083 0.9494424
179.0 292 1 0.9168173 0.0150833 0.8877261 0.9468618
183.0 290 1 0.9136558 0.0153590 0.8840432 0.9442604
184.0 289 1 0.9104944 0.0156279 0.8803738 0.9416455
194.0 285 1 0.9072997 0.0158962 0.8766725 0.9389968
203.0 284 1 0.9041049 0.0161581 0.8729839 0.9363354
205.0 283 1 0.9009102 0.0164138 0.8693074 0.9336619
208.0 282 1 0.8977155 0.0166636 0.8656425 0.9309769
216.0 280 1 0.8945094 0.0169097 0.8619735 0.9282734
218.0 279 1 0.8913033 0.0171503 0.8583152 0.9255592
222.0 277 1 0.8880856 0.0173877 0.8546520 0.9228270
237.0 274 1 0.8848444 0.0176237 0.8509681 0.9200692
242.0 273 1 0.8816032 0.0178547 0.8472941 0.9173016
247.0 272 1 0.8783620 0.0180808 0.8436295 0.9145244
256.0 269 1 0.8750967 0.0183061 0.8399430 0.9117217
266.0 266 1 0.8718069 0.0185305 0.8362338 0.9088932
267.0 264 1 0.8685046 0.0187523 0.8325176 0.9060471
268.0 263 1 0.8652023 0.0189695 0.8288102 0.9031923
278.0 260 1 0.8618746 0.0191862 0.8250788 0.9003113
279.0 259 1 0.8585469 0.0193986 0.8213559 0.8974218
280.0 258 1 0.8552192 0.0196067 0.8176412 0.8945241
285.0 257 1 0.8518915 0.0198108 0.8139345 0.8916185
287.0 256 1 0.8485638 0.0200110 0.8102356 0.8887051
294.0 254 1 0.8452230 0.0202091 0.8065275 0.8857749
297.0 253 1 0.8418822 0.0204035 0.8028269 0.8828373
304.0 250 1 0.8385146 0.0205979 0.7990999 0.8798735
305.0 249 1 0.8351471 0.0207887 0.7953800 0.8769024
308.0 248 1 0.8317796 0.0209758 0.7916672 0.8739244
311.0 247 2 0.8250445 0.0213398 0.7842618 0.8679480
313.0 245 2 0.8183095 0.0216905 0.7768823 0.8619457
316.0 243 1 0.8149419 0.0218611 0.7732019 0.8589352
324.0 240 1 0.8115463 0.0220321 0.7694930 0.8558980
333.0 239 1 0.8081507 0.0222000 0.7657901 0.8528547
342.0 238 1 0.8047552 0.0223649 0.7620931 0.8498054
343.0 237 1 0.8013596 0.0225269 0.7584020 0.8467504
352.0 234 1 0.7979350 0.0226894 0.7546810 0.8436679
353.0 233 1 0.7945103 0.0228490 0.7509658 0.8405798
355.0 232 1 0.7910857 0.0230057 0.7472563 0.8374860
356.0 231 3 0.7808119 0.0234592 0.7361603 0.8281718
362.0 228 1 0.7773873 0.0236049 0.7324722 0.8250565
373.0 226 1 0.7739475 0.0237498 0.7287709 0.8219246
374.0 225 1 0.7705077 0.0238920 0.7250748 0.8187875
381.0 221 1 0.7670213 0.0240370 0.7213274 0.8156098
385.0 220 1 0.7635348 0.0241792 0.7175851 0.8124268
386.0 219 1 0.7600484 0.0243189 0.7138480 0.8092388
408.0 218 1 0.7565619 0.0244560 0.7101159 0.8060457
415.0 217 1 0.7530754 0.0245905 0.7063888 0.8028477
418.0 216 1 0.7495890 0.0247226 0.7026665 0.7996448
419.0 215 1 0.7461025 0.0248522 0.6989490 0.7964372
420.0 214 1 0.7426161 0.0249795 0.6952362 0.7932248
426.0 213 2 0.7356431 0.0252268 0.6878244 0.7867863
427.0 211 1 0.7321567 0.0253471 0.6841253 0.7835602
468.0 203 1 0.7285500 0.0254775 0.6802878 0.7802360
478.0 201 2 0.7213007 0.0257345 0.6725852 0.7735448
506.0 197 1 0.7176393 0.0258630 0.6686976 0.7701631
514.0 196 1 0.7139779 0.0259889 0.6648151 0.7667763
542.0 178 1 0.7099668 0.0261506 0.6605188 0.7631165
547.0 175 1 0.7059098 0.0263140 0.6561743 0.7594151
553.0 171 1 0.7017817 0.0264820 0.6517509 0.7556530
554.0 168 1 0.6976044 0.0266518 0.6472757 0.7518464
560.0 162 1 0.6932982 0.0268329 0.6426520 0.7479358
561.0 158 1 0.6889103 0.0270194 0.6379374 0.7439560
586.0 132 1 0.6836912 0.0273142 0.6321988 0.7393777
595.0 110 1 0.6774759 0.0277640 0.6251875 0.7341374
614.0 78 1 0.6687903 0.0287345 0.6147777 0.7275482
45.0 34 1 0.9705882 0.0289760 0.9154259 1.0000000
50.0 33 1 0.9411765 0.0403526 0.8653187 1.0000000
60.0 32 1 0.9117647 0.0486433 0.8212409 1.0000000
66.0 31 1 0.8823529 0.0552551 0.7804373 0.9975775
67.0 30 1 0.8529412 0.0607387 0.7418297 0.9806949
76.0 30 1 0.8245098 0.0649678 0.7065206 0.9622033
79.0 29 1 0.7960784 0.0686098 0.6723499 0.9425760
81.0 29 1 0.7686275 0.0713975 0.6406902 0.9221120
102.5 35 1 0.7466667 0.0741286 0.6146388 0.9070549
108.0 35 1 0.7253333 0.0745794 0.5929477 0.8872763
136.0 46 1 0.7095652 0.0755026 0.5759949 0.8741098
142.0 46 1 0.6941399 0.0747986 0.5619840 0.8573736
155.0 49 1 0.6799738 0.0752883 0.5473248 0.8447714
158.0 48 1 0.6658076 0.0744211 0.5348169 0.8288814
159.0 49 1 0.6522197 0.0746348 0.5211818 0.8162038
175.5 50 1 0.6391753 0.0750670 0.5077523 0.8046151
177.0 49 1 0.6261309 0.0752042 0.4947980 0.7923232
178.0 48 1 0.6130866 0.0741947 0.4836274 0.7771999
196.0 50 1 0.6008248 0.0745133 0.4711760 0.7661477
216.0 49 1 0.5885631 0.0746233 0.4590607 0.7545985
231.0 48 1 0.5763014 0.0743636 0.4475216 0.7421390
245.0 49 1 0.5645401 0.0732355 0.4377961 0.7279771
247.0 48 1 0.5527789 0.0721052 0.4280747 0.7138111
253.0 48 1 0.5412626 0.0709484 0.4186326 0.6998147
254.0 47 1 0.5297464 0.0697898 0.4091938 0.6858150
263.0 47 1 0.5184752 0.0695743 0.3985704 0.6744518
267.0 48 1 0.5076736 0.0697549 0.3878186 0.6645697
274.0 47 1 0.4968721 0.0692230 0.3781440 0.6528779
275.0 47 1 0.4863003 0.0693049 0.3677866 0.6430033
277.0 46 1 0.4757286 0.0689086 0.3581491 0.6319091
285.0 45 1 0.4651568 0.0676756 0.3497504 0.6186436
288.0 45 1 0.4548200 0.0664309 0.3415964 0.6055722
293.0 44 1 0.4444832 0.0651852 0.3334443 0.5924988
315.0 43 1 0.4341464 0.0646618 0.3242337 0.5813185
317.0 42 1 0.4238096 0.0646638 0.3142654 0.5715378
323.0 42 1 0.4137189 0.0633933 0.3063920 0.5586416
360.0 41 1 0.4036281 0.0621022 0.2985490 0.5456916
365.0 40 1 0.3935374 0.0622261 0.2886650 0.5365102
373.0 39 1 0.3834467 0.0623399 0.2788167 0.5273408
375.0 38 1 0.3733560 0.0610309 0.2710072 0.5143580
379.0 37 1 0.3632653 0.0597201 0.2632013 0.5013718
391.0 37 1 0.3534474 0.0593381 0.2543442 0.4911652
423.0 36 2 0.3338114 0.0589956 0.2360834 0.4719945
464.0 38 1 0.3250269 0.0580308 0.2290574 0.4612052
480.0 35 1 0.3157404 0.0565772 0.2222308 0.4485966
483.0 34 1 0.3064539 0.0565645 0.2134285 0.4400255
498.0 33 1 0.2971674 0.0551030 0.2066165 0.4274028
504.0 32 1 0.2878809 0.0543287 0.1988725 0.4167265
561.0 30 1 0.2782849 0.0533702 0.1910926 0.4052616
562.0 29 1 0.2686889 0.0534381 0.1819528 0.3967717
722.0 3 1 0.1791259 0.0821321 0.0729238 0.4399951
get_robscore_pval = function(data, covariates, tstart = "tstart", tstop = "tstop", event = "death", cluster = "id", strata = NULL) {
  
  # Fit CPH model to extract robust score test p-value, since it is the best
  # way of dealing with left- and right-censored data
  
  # Construct formula
  covariates = paste(covariates, collapse = " + ")
    
  # Create formula for CPH model
  if (!is.null(strata)) {
    formula_string = paste0("Surv(", tstart, ", ", tstop, ", ", event, ") ~ ", covariates, " + strata(", strata, ") + cluster(", cluster, ")")
  } else {
    formula_string = paste0("Surv(", tstart, ", ", tstop, ", ", event, ") ~ ", covariates, " + cluster(", cluster, ")")
  }
  
  # Convert to formula object
  formula_string = as.formula(formula_string)
    
  # Train CPH model
  cph_fit = coxph(formula_string,data = data)

  # Get p-value from robust score test
  cph_tests_summary = summary(cph_fit)
  robscore_pval = cph_tests_summary$robscore[["pvalue"]]

  return(robscore_pval)
}
# We perform the log-rank test separately, using coxph(), because it can handle left- and right-censored data
# Later, we need to manually add the p-value into the KM curves
km_pval = get_robscore_pval(data = survival_vte_tmerge, covariates = c("vte_event"))
# jpeg("../graphs/KM_curve_VTE.jpeg", width = 6.5, height = 4, units = 'in', res = 700)

km_plot_vte = 
  ggsurvplot(
    fit = km_fit_vte,
    data = survival_vte_tmerge,
    xscale = 30,
    break.time.by = 60,
    censor = FALSE,
    # pval = TRUE, 
    # pval.method = TRUE,
     #pval.size = 4, 
    conf.int = TRUE,
    conf.int.alpha = 0.15,
    xlim = c(0, 540),
    surv.plot.height = 1.25, 
    tables.height = 0.25,
    legend.title = " ",
    legend.labs = c("No VTE", "VTE"),
    risk.table = "abs_pct", 
    risk.table.height = 0.3, 
    fontsize = 3,
    risk.table.col = "strata", 
    risk.table.y.text.col = TRUE,
    # linetype = "strata", 
    surv.median.line = "hv",
    xlab = "Time to death (months)",
    ylab = "Overall Survival Probability",
    ggtheme = theme_bw(),
    palette = c("black", "red")
  )

# Add p-value
km_plot_vte$plot = km_plot_vte$plot + 
  annotate(
    geom = "text", 
    x = 26,
    y = 0.3,
    label = "Log-rank",
    color = "black") + 
  annotate(
    geom = "text", 
    x = 28,
    y = 0.2,
    label = as.character(format(
      km_pval, 
      scientific = TRUE, 
      digits = 1)),
    color = "black")

# Customize risk table title 
km_plot_vte$table = 
  km_plot_vte$table +
  theme(plot.title = element_text(size = 10))

km_plot_vte

# Save plot
# ggsave(
#   filename = "../graphs/KM_curve_VTE.jpeg", 
#   # width = 6.5, 
#   # height = 4, 
#   units = 'in', 
#   dpi = 700
#   )

# dev.off()

No VTE vs single VTE vs VTE recurrence

VTE recurrence is also a time-dependent covariate.

# KM estimation
km_fit_vte_recurrence = survfit(Surv(
  time = tstart, 
  time2 = tstop,
  event = death) ~ vte_recurrence + cluster(id), 
    id = id,
  data = survival_vte_tmerge)

km_fit_vte_recurrence
## Call: survfit(formula = Surv(time = tstart, time2 = tstop, event = death) ~ 
##     vte_recurrence + cluster(id), data = survival_vte_tmerge, 
##     id = id)
## 
##                                  records   n events median 0.95LCL 0.95UCL
## vte_recurrence=No VTE                558 302     97     NA      NA      NA
## vte_recurrence=No VTE recurrence     233  77     43    483     373      NA
## vte_recurrence=VTE recurrence         49  11      9    293     254      NA
# Get ...
km_pval_vte_recurrence = get_robscore_pval(survival_vte_tmerge, covariates = c("vte_recurrence"))
# Plot KM curve of patients with no CAT, with a single CAT and CAT recurrence
km_plot_vte_recurrence = 
  ggsurvplot(
    fit = km_fit_vte_recurrence,
    data = survival_vte_tmerge,
    xscale = 30,
    break.time.by = 60,
    censor = FALSE,
    conf.int = FALSE,
    xlim = c(0, 540),
    # surv.plot.height = 1.25, 
    # tables.height = 0.25,
    legend.title = " ",
    legend.labs = c("No CAT", "CAT w/o recurrence", "CAT recurrence"),
    risk.table = "abs_pct", 
    risk.table.height = 0.3, 
    fontsize = 3,
    risk.table.col = "strata", 
    tables.y.text = FALSE,
    #tables.y.text.col = TRUE,
    risk.table.y.text.col = TRUE,
    #linetype = "strata", 
    surv.median.line = "hv",
    xlab = "Time (months)",
    ylab = "OS Probability",
    ggtheme = theme_bw(),
    palette = c("black", "red", "#FF6600")
    )
## Warning in geom_segment(aes(x = 0, y = max(y2), xend = max(x1), yend = max(y2)), : All aesthetics have length 1, but the data has 2 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
##   a single row.
# Count number of patients of each category
n_vte_no_recurrence = sum(oncoth1_data$vte_recurrence == "No VTE recurrence")
n_vte_recurrence = sum(oncoth1_data$vte_recurrence == "VTE recurrence")


# Customize risk table title 
km_plot_vte_recurrence$table = 
  km_plot_vte_recurrence$table +
  theme(plot.title = element_text(size = 10), legend.position = "none") 

# Add p-value and n of patient subgroups
km_plot_vte_recurrence$plot = 
  km_plot_vte_recurrence$plot + 
  # Log-rank p-value
  annotate(
    geom = "text", 
    x = 26,
    y = 0.3,
    label = "Log-rank",
    color = "black") + 
  annotate(
    geom = "text", 
    x = 28,
    y = 0.2,
    label = if (km_pval_vte_recurrence < 0.0001) {"< 0.0001"} else {km_pval_vte_recurrence},
    color = "black") + 
  # n subgroups
  annotate(
    geom = "text",
    size = 3.5,
    x = 530, 
    y = 0.99, 
    label = "n=302", 
    color = "black") + 
  annotate(
    geom = "text",
    size = 3.5,
    x = 530, 
    y = 0.91, 
    label = paste0("n=", n_vte_no_recurrence), 
    color = "red") +
  annotate(
    geom = "text",
    size = 3.5,
    x = 530, 
    y = 0.83, 
    label = paste0("n=", n_vte_recurrence), 
    color = "#FF6600")

km_plot_vte_recurrence
## Warning in geom_segment(aes(x = 0, y = max(y2), xend = max(x1), yend = max(y2)), : All aesthetics have length 1, but the data has 2 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
##   a single row.
## All aesthetics have length 1, but the data has 2 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
##   a single row.
## All aesthetics have length 1, but the data has 2 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
##   a single row.

# # Save plot
# ggsave(
#   filename = "../graphs/KM_curve_CAT_recurrence.jpeg",
#   plot = km_plot_vte_recurrence$plot,
#   width = 7, 
#   height = 4,
#   units = "in"
#   )

# dev.off()
# Save plot with table

# Combine the main plot and the cumulative events table
KM_curve_CAT_recurrence_combined_plot <- cowplot::plot_grid(
  km_plot_vte_recurrence$plot,
  km_plot_vte_recurrence$table,
  ncol = 1, 
  rel_heights = c(1.5, 0.7)  # Adjust heights to fit both plots
)
## Warning in geom_segment(aes(x = 0, y = max(y2), xend = max(x1), yend = max(y2)), : All aesthetics have length 1, but the data has 2 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
##   a single row.
# Save plot
ggsave(
  filename = "../graphs/KM_curve_CAT_recurrence.jpeg",
  plot = KM_curve_CAT_recurrence_combined_plot, 
  width = 7, 
  height = 4,
  units = "in"
  )
# Train CPH model to assess weight of VTE recurrence
# Adjust by age, sex, cancer type and stage
cph_vte_recurrence = coxph(Surv(
  time = tstart, 
  time2 = tstop, 
  event = death) ~ 
    age_when_cancer_dx + 
    gender +
    primary_tumor_simplified +
    tnm_stage_grouped +
    vte_event:vte_recurrence + # VTE recurrence only makes sense if there is VTE
    cluster(id),
  id = id,
  data = survival_vte_tmerge
  ) 
## Warning in agreg.fit(X, Y, istrat, offset, init, control, weights = weights, :
## Loglik converged before variable 8 ; beta may be infinite.
# Test PH assumption
cox.zph(cph_vte_recurrence)
##                           chisq df    p
## age_when_cancer_dx        0.379  1 0.54
## gender                    0.392  1 0.53
## primary_tumor_simplified  6.123  3 0.11
## tnm_stage_grouped         2.937  2 0.23
## vte_event:vte_recurrence  4.371  3 0.22
## GLOBAL                   16.356 10 0.09
# Inspect plot for violation of PH assumption
ggcoxzph(cox.zph(cph_vte_recurrence))

# Show betas and p-values
cph_vte_recurrence %>% 
  gtsummary::tbl_regression(exp = TRUE)
Characteristic HR1 95% CI1 p-value
age_when_cancer_dx 1.02 1.00, 1.04 0.043
gender


    Male
    Female 0.84 0.57, 1.23 0.4
primary_tumor_simplified


    Colorectal
    NSCLC 3.07 1.85, 5.10 <0.001
    Oesophago-gastric 2.94 1.74, 4.96 <0.001
    Pancreatic 4.93 2.90, 8.38 <0.001
tnm_stage_grouped


    I-II
    III 2.79 1.27, 6.12 0.010
    IV 9.17 4.45, 18.9 <0.001
vte_event * vte_recurrence


    vte_event * No VTE 0.00 0.00, 0.00 <0.001
    vte_event * No VTE recurrence 2.43 1.60, 3.69 <0.001
    vte_event * VTE recurrence 2.62 1.54, 4.47 <0.001
1 HR = Hazard Ratio, CI = Confidence Interval
# C-index 
concordance(cph_vte_recurrence)
## Call:
## concordance.coxph(object = cph_vte_recurrence)
## 
## n= 840 
## Concordance= 0.7912 se= 0.0168
## concordant discordant     tied.x     tied.y    tied.xy 
##      35164       9264         48         18          0
# 18-month OS rate
summary(km_fit_vte_recurrence, times = 18*30.25)
## Call: survfit(formula = Surv(time = tstart, time2 = tstop, event = death) ~ 
##     vte_recurrence + cluster(id), data = survival_vte_tmerge, 
##     id = id)
## 
##                 vte_recurrence=No VTE 
##         time       n.risk      n.event     survival      std.err lower 95% CI 
##     544.5000     174.0000      89.0000       0.6944       0.0271       0.6433 
## upper 95% CI 
##       0.7496 
## 
##                 vte_recurrence=No VTE recurrence 
##         time       n.risk      n.event     survival      std.err lower 95% CI 
##      544.500       32.000       40.000        0.465        0.058        0.364 
## upper 95% CI 
##        0.594 
## 
##                 vte_recurrence=VTE recurrence 
##         time       n.risk      n.event     survival      std.err lower 95% CI 
##     544.5000       2.0000       9.0000       0.1818       0.1163       0.0519 
## upper 95% CI 
##       0.6369

Multivariate CPH regression

Some pointers:

Prior selection of features

Linear models do not work well when there is multicollinearity. Also, given the small sample size, we should limit our model to have a pool of features from which then apply automatic feature selection.

Variance per feature

# Convert categorical variables into dummies (one-hot encoding)
survival_vte_tmerge_dummies = survival_vte_tmerge %>%
  # Remove useless variables
  select(-c(id, tstart, tstop, contains("tu_"))) %>%
  # Convert KRS to numeric
  mutate(khorana_risk_score_imp = as.numeric(khorana_risk_score_imp)) %>%
  # Create dummies for all categorical variables (character and factors)
  fastDummies::dummy_cols(., remove_first_dummy = TRUE, remove_selected_columns = TRUE)
# Variance per feature
variance_per_feature = sapply(survival_vte_tmerge_dummies, function (x) var(x, na.rm = TRUE))
variance_per_feature = variance_per_feature[order(variance_per_feature)] 

# Threshold
variance_threshold = 0.1 # 0.05  

# Variables with less than threshold variance, ordered from less to more
variance_per_feature[order(variance_per_feature[variance_per_feature < variance_threshold])]
##                         oral_contraceptive_tx_Yes 
##                                       0.000000000 
##                          anticoag_tx_apixaban_Yes 
##                                       0.000000000 
##                        anticoag_tx_dabigatran_Yes 
##                                       0.000000000 
##                       anticoag_tx_other_drugs_Yes 
##                                       0.000000000 
##                  ethnicity_Black/African American 
##                                       0.001190476 
##                                   krs_category_NA 
##                                       0.001190476 
##                               metastasis_skin_Yes 
##                                       0.001190476 
##                      anticoag_tx_other_causes_Yes 
##                                       0.001190476 
##                           ethnicity_Sephardic Jew 
##                                       0.002378115 
##                                 pregnancy_imp_Yes 
##                                       0.002378115 
##                                   n_stage_imp_N3b 
##                                       0.003562915 
##                                    n_metastases_5 
##                                       0.003562915 
##     family_background_vte_all_relatives_Offspring 
##                                       0.003562915 
##        family_background_vte_all_relatives_Others 
##                                       0.003562915 
##               family_background_VTE_offspring_Yes 
##                                       0.003562915 
##                                  n_previous_ate_3 
##                                       0.003562915 
##                          oral_contraceptive_tx_NA 
##                                       0.004744878 
##                       metastasis_soft_tissues_Yes 
##                                       0.004744878 
##                         family_background_vte_n_2 
##                                       0.004744878 
## family_background_vte_all_relatives_Two relatives 
##                                       0.004744878 
##                           previous_ate_type_Other 
##                                       0.004744878 
##                              other_thromb_ate_Yes 
##                                       0.004744878 
##                       anticoag_tx_cardiopathy_Yes 
##                                       0.004744878 
##                               anticoag_tx_vte_Yes 
##                                       0.006992730 
##      family_background_vte_all_relatives_Siblings 
##                                       0.007100289 
##                       anticoag_tx_vit_k_antag_Yes 
##                                       0.007100289 
##         family_background_VTE_other_relatives_Yes 
##                                       0.007100289 
##                             previous_ate_type_TIA 
##                                       0.007100289 
##                            tnm_stage_detailed_IIc 
##                                       0.008273739 
##                family_background_VTE_siblings_Yes 
##                                       0.008273739 
##                                  n_previous_ate_2 
##                                       0.008273739 
##                                 n_antiaggreg_tx_2 
##                                       0.008273739 
##                antiaggreg_tx_reason_Two ATE types 
##                                       0.008273739 
##                        anticoag_tx_lmwh_dosage_NA 
##                                       0.009444350 
##                                   t_stage_imp_T1a 
##                                       0.010612123 
##                              anticoag_tx_lmwh_Yes 
##                                       0.011140825 
##                             tnm_stage_detailed_Ib 
##                                       0.011777059 
##                     previous_ate_type_Two or more 
##                                       0.011777059 
##                             anticoag_tx_reason_NA 
##                                       0.011777059 
##                               hormone_therapy_Yes 
##                                       0.012939157 
##                                   t_stage_imp_T1b 
##                                       0.012939157 
##        family_background_vte_all_relatives_Father 
##                                       0.012939157 
##              major_surgery_in_medical_history_Yes 
##                                       0.014098416 
##                              metastasis_brain_Yes 
##                                       0.014098416 
##                  family_background_VTE_father_Yes 
##                                       0.014098416 
##                                       tia_ate_Yes 
##                                       0.015254839 
##                                   n_stage_imp_N1c 
##                                       0.016408423 
##                          anticoag_tx_vte_drugs_NA 
##                                       0.016408423 
##                         anticoag_tx_currently_Yes 
##                                       0.016408423 
##                     metastasis_adrenal_glands_Yes 
##                                       0.017559169 
##        family_background_vte_all_relatives_Mother 
##                                       0.017559169 
##                                   t_stage_imp_T2a 
##                                       0.018707078 
##                  family_background_VTE_mother_Yes 
##                                       0.020994381 
##                             previous_ate_type_PAD 
##                                       0.020994381 
##                                   n_stage_imp_N2a 
##                                       0.022133776 
##                             previous_ate_type_CVA 
##                                       0.022133776 
##                          ethnicity_Latin-American 
##                                       0.023270333 
##                                    n_metastases_4 
##                                       0.023270333 
##                                       cva_ate_Yes 
##                                       0.023270333 
##                          antiaggreg_tx_reason_CVA 
##                                       0.024404052 
##                 chronic_cardiac_insufficiency_Yes 
##                                       0.024404052 
##                       ethnicity_grouped_Caucasian 
##                                       0.026662977 
##                       thromboprophylaxis_hospital 
##                                       0.027788183 
##                                   t_stage_imp_T2b 
##                                       0.027788183 
##                                       pad_ate_Yes 
##                                       0.027788183 
##                             antiaggreg_tx_cva_Yes 
##                                       0.027788183 
##                                   n_stage_imp_N1b 
##                                       0.028910551 
##             menopausal_status_imp_Peri-menopausal 
##                                       0.031146773 
##                              other_metastases_Yes 
##                                       0.031146773 
##                                  previous_vte_Yes 
##                                       0.031146773 
##                             metastasis_pleura_Yes 
##                                       0.032260628 
##           previous_ate_type_Myocardial infarction 
##                                       0.033371644 
##                                    t_stage_imp_T1 
##                                       0.035585164 
##              menopausal_status_imp_Pre-menopausal 
##                                       0.037787332 
##                                 erythropoietin_tx 
##                                       0.039978149 
##                                 body_surface_area 
##                                       0.040845545 
##                         family_background_vte_n_1 
##                                       0.043243090 
##                     myocardial_infarction_ate_Yes 
##                                       0.043243090 
##                                   n_stage_imp_N1a 
##                                       0.044325728 
##                         family_background_vte_Yes 
##                                       0.047556615 
##                  antiaggreg_tx_reason_Cardiopathy 
##                                       0.047556615 
##                            tnm_stage_detailed_III 
##                                       0.050761962 
##                                    n_metastases_3 
##                                       0.050761962 
##                              metastasis_bones_Yes 
##                                       0.050761962 
##                           renal_insufficiency_Yes 
##                                       0.051824735 
##                                   n_stage_imp_N2b 
##                                       0.052884670 
##                     vte_recurrence_VTE recurrence 
##                                       0.054996027 
##                     antiaggreg_tx_cardiopathy_Yes 
##                                       0.054996027 
##                           tnm_stage_detailed_IIIc 
##                                       0.058141779 
##                            tnm_stage_detailed_IVb 
##                                       0.058141779 
##                                t_stage_grouped_T1 
##                                       0.060871126 
##                                   t_stage_imp_T4b 
##                                       0.062296385 
##                       antiaggreg_tx_reason_Others 
##                                       0.062296385 
##                    antiaggreg_tx_other_causes_Yes 
##                                       0.066405585 
##                            tnm_stage_detailed_IIa 
##                                       0.068443158 
##       performance_status_category_corrected_imp_2 
##                                       0.070469380 
##                            tnm_stage_detailed_IVa 
##                                       0.072484250 
##                                    t_stage_imp_T2 
##                                       0.077471763 
##                                t_stage_grouped_NA 
##                                       0.077471763 
##                           tnm_stage_detailed_IIIa 
##                                       0.078460753 
##                     histology_type_imp_Epidermoid 
##                                       0.080430217 
##                                  n_previous_ate_1 
##                                       0.082388331 
##                            tnm_stage_detailed_IIb 
##                                       0.084335093 
##                                       m_stage_M1a 
##                                       0.086270503 
##                                  previous_ate_Yes 
##                                       0.092008627 
##                                n_stage_grouped_N3 
##                                       0.092687623 
##       age_when_cancer_dx_50_Older or 50 years old 
##                                       0.094839378 
##                                n_stage_grouped_NA 
##                                       0.099500539

Correlation plot

# Correlation matrix
oncoth1_correlations_vars = survival_vte_tmerge_dummies %>%
  # Select relevant variables
  select(
    age_when_cancer_dx, 
    # `age_quartiles_61-69`,
    # `age_quartiles_70-93`,
    # `age_when_cancer_dx_50_Older or 50 years old`,
    # `age_when_cancer_dx_65_Older or 65 years old`,
    weight, 
    height, 
    body_surface_area, 
    bmi_value, 
    khorana_risk_score_imp, 
    death, 
    vte_event, 
    blood_transfusion, 
    immobilisation, 
    recent_major_surgery, 
    thromboprophylaxis_hospital, 
    erythropoietin_tx, 
    gender_Female, 
    `menopausal_status_imp_Pre-menopausal`,
    `menopausal_status_imp_Peri-menopausal`, 
    `menopausal_status_imp_Post-menopausal`, 
    performance_status_category_corrected_imp_1, 
    performance_status_category_corrected_imp_2, 
    diabetes_mellitus_Yes, 
    dyslipidemia_Yes, 
    `tobacco_use_imp_Former smoker`, 
    `tobacco_use_imp_Active smoker`, 
    `tobacco_use_imp_grouped_Never`, 
    copd_imp_Yes, 
    arterial_hypertension_Yes, 
    chronic_cardiac_insufficiency_Yes, 
    renal_insufficiency_Yes, 
    previous_onco_surgery_Yes, 
    tumor_surgically_removed_Yes, 
    primary_tumor_simplified_NSCLC, 
    `primary_tumor_simplified_Oesophago-gastric`, 
    primary_tumor_simplified_Pancreatic, 
    tnm_stage_II, 
    tnm_stage_III, 
    tnm_stage_IV, 
    # metastasis_bones_Yes,
    # metastasis_liver_Yes,
    # metastasis_lung_Yes,
    # metastasis_lymph_nodes_Yes,
    # metastasis_peritoneum_Yes,
    histology_type_imp_Adenocarcinoma, 
    histology_type_imp_Epidermoid, 
    mucinous_histology_imp_Yes, 
    `grade_histological_differentiation_imp_Moderately differentiated`,
    `grade_histological_differentiation_imp_Poorly differentiated`, 
    catheter_device_imp_Yes, 
    venous_insufficiency_imp_Yes, 
    # n_comorbidities_1, 
    # `n_comorbidities_2+`, 
    family_background_vte_Yes, 
    previous_vte_Yes, 
    previous_ate_Yes,
    anticoag_tx_currently_Yes,
    antiaggreg_tx_currently_Yes
    ) %>%
  cor(.)
# Heatmap and dendrogram
heatmap(oncoth1_correlations_vars)

# Interactive heatmap
plot_ly(
  data = reshape2::melt(oncoth1_correlations_vars),
  x = ~Var1,
  y = ~Var2,
  z = ~value,
  type = "heatmap",
  colors = colorRamp(c("blue", "white", "red")),
  colorbar = list(title = "Correlation"),
  showscale = TRUE
) %>%
  layout(
    title = "Interactive Heatmap of Correlations in ONCOTHROMB1",
    xaxis = list(title = "Variables"),
    yaxis = list(title = "Variables")
  )
  • Menopausal status is correlated to age and sex. We remove it from the training of the model.
  • Height, weight and body surface area are correlated with sex.
  • Logically, BMI is correlated to body surface area.
  • Having more than 2 comorbidities is correlated with diabetes mellitus, dyslipidemia and arterial hypertension; and, to a lesser degree, with COPD, chronic cardiac insufficiency and renal insufficiency.
  • Previous ATE is correlated with the intake of antiaggregants, and slightly with arterial hypertension and chronic cardiac insufficiency.
  • Central venous catheter is negatively correlated to NSCLC.
  • Adenocarcinoma and epidermoid types are correlated to NSCLC.
  • Grade 2 of histological differentitation is negatively correlated to NSCLC.
  • Stage IV is very correlated to metastases locations (peritoneum, lymph nodes, lung, liver, bones…).
  • Khorana Risk Score is correlated to pancreatic and esophago-gastric cancers.
  • Blood transfusion is slightly correlated to pancreatic cancer. Also, somewhat correlated to EPO.

Association between categorical variables using Cramer’s V

# Do correlation between CVC and tumor surgically removed with primary tumor

# Calculate Cramér's V
# Close to 0: Weak or no association.
# Around 0.1: Small association.
# Around 0.3: Moderate association.
# Around 0.5: Strong association.
# Close to 1: Very strong or perfect association.

# Primary tumor and CVC
contingency_table1 <- table(
  oncoth1_data$primary_tumor_simplified,
  oncoth1_data$catheter_device
  )

cramers_v1 <- round(sqrt(chisq.test(contingency_table1)$statistic / sum(contingency_table1)), digits = 4)
print(paste("Cramer's V for cancer type and CVC =", cramers_v1))
## [1] "Cramer's V for cancer type and CVC = 0.4388"
# Primary tumor and tumor surgically removed
contingency_table2 <- table(
  oncoth1_data$primary_tumor_simplified,
  oncoth1_data$tumor_surgically_removed
  )

cramers_v2 <- round(sqrt(chisq.test(contingency_table2)$statistic / sum(contingency_table2)), digits = 4)
print(paste("Cramer's V for cancer type and surgical resection of tumor =", cramers_v2))
## [1] "Cramer's V for cancer type and surgical resection of tumor = 0.3397"
# Family background of VTE and venous insufficiency
contingency_table3 <- table(
  oncoth1_data$family_background_vte,
  oncoth1_data$catheter_device
  )

cramers_v3 <- round(sqrt(chisq.test(contingency_table3)$statistic / sum(contingency_table3)), digits = 4)
print(paste("Cramer's V for family background VTE and venous insufficiency =", cramers_v3))
## [1] "Cramer's V for family background VTE and venous insufficiency = 0.0076"

Stepwise feature selection CPH [careful with StepReg version]

StepReg package vignette:

https://cran.r-project.org/web/packages/StepReg/vignettes/StepReg.html

# Create dataset
survival_vte_tmerge_stepwise = survival_vte_tmerge %>%
  filter(!is.na(n_comorbidities))

# Do one-hot encoding with primary tumour
survival_vte_tmerge_stepwise = fastDummies::dummy_cols(survival_vte_tmerge_stepwise, select_columns = "primary_tumor_simplified")

# Rename with proper format
survival_vte_tmerge_stepwise %<>%
  rename(primary_tumor_simplified_Oesophagogastric = `primary_tumor_simplified_Oesophago-gastric`)

Training/test set partition and CV

See if there is consistency in variable selection and predictive capability of CV partitions, as well as with test set.

# Target distribution when event of interest is death
table(oncoth1_data$patient_status_at_end_study_simplified, useNA = "always")
## 
##   Alive    Dead Unknown    <NA> 
##     236     149       5       0
# Partition data with repeated rows per patient into training and test set

# Set seed
set.seed(seed) 

# Get unique IDs
unique_ids <- unique(survival_vte_tmerge_stepwise$id)

# Randomly assign each unique ID to either train or test
train_ids <- sample(unique_ids, size = floor(0.8 * length(unique_ids))) # 80% for training
test_ids <- setdiff(unique_ids, train_ids)  # Remaining IDs for testing

# Partition the data into train and test sets based on the assigned IDs
stepwise_training_set <- survival_vte_tmerge_stepwise %>% filter(id %in% train_ids)
stepwise_testing_set <- survival_vte_tmerge_stepwise %>% filter(id %in% test_ids)

# Check that IDs are not in training and test sets
stopifnot(
  identical(
    sort(unique(stepwise_training_set$id)), sort(unique(stepwise_testing_set$id))) == FALSE)
# Create folds for k-fold CV

# Set seed
set.seed(seed)

# Number of folds
k = 10

# Create a grouping variable for each ID
group_var <- as.numeric(factor(stepwise_training_set$id))

# Determine the unique IDs and shuffle them randomly to ensure a balanced distribution across the folds
# Get unique IDs
folds_unique_ids <- unique(group_var)

# Shuffle the unique IDs randomly
folds_shuffled_ids <- sample(folds_unique_ids)

# Divide the shuffled IDs into folds
cv_folds_index <- cut(folds_shuffled_ids, breaks = k, labels = FALSE)
# Save IDs in file for reproducibility
run_datetime = Sys.time()

# Function to append data to the file
append2file = function(file_name = "./ids_partition_run.csv", data) {
  write.table(
    x = data,
    file = file_name,
    append = TRUE,
    sep = "\t",
    col.names = !file.exists(file_name),
    row.names = FALSE,
    quote = FALSE
    )
}
# Set seed
set.seed(seed)

# Initialize a vector to store the evaluation results
stepwise_cv_index_valid <- vector("list", length = k)
stepwise_cv_vars_results <- vector("list", length = k)
stepwise_cv_cindex_results <- vector("list", length = k)
stepwise_cv_vif_results <- vector("list", length = k) # Inspection of GVIF in trained models is necessary to detect multicollinearity issues

for (i in 1:k) {
  # Get the validation IDs
  validation_ids <- unique_ids[cv_folds_index == i]
  
  # Get the validation fold
  validation_fold <- stepwise_training_set[stepwise_training_set$id %in% validation_ids, ]
  
  # Get the training fold
  training_fold <- stepwise_training_set[!(stepwise_training_set$id %in% validation_ids), ]
  
  # print("Training")
  # print(head(training_fold, n = k))
  # 
  # print("Validation")
  # print(head(validation_fold, n = k))
  
  # Check that IDs are not in training and test sets
  stopifnot(
    identical(
      sort(unique(training_fold$id)), sort(unique(validation_fold$id))) == FALSE)
  
  # Train your model on the training fold
  stepwise_model = StepReg::stepwiseCox(
    formula = Surv(time = tstart, time2 = tstop, event = death) ~
    vte_event +
    age_when_cancer_dx +
    gender +
    bmi_value +
    performance_status_category_corrected_imp + # Multicollinearity with cancer type, tobacco use, tumor resection in some folds, but models can be trained
    # diabetes_mellitus + # Confounder. We know it is associated with pancreatic cancer patients that undergo resection of pancreas
    dyslipidemia +
    # tobacco_use_imp + # Correlated to NSCLC. Multicollinearity if PS is also selected
    # copd_imp + # Correlated to tobacco use
    arterial_hypertension +
    # venous_insufficiency_imp +  # remove as suggested by Andrés Muñoz (bad definition of variable, information may refer to different conditions)
    # khorana_risk_score_imp + # Correlated to pancreatic and esophago-gastric cancers
    # two_groups_krs + # Correlated to pancreatic and esophago-gastric cancers
    # previous_onco_surgery + # Correlated to having had tumor resected
    tumor_surgically_removed + 
    primary_tumor_simplified +
    # tnm_stage_grouped +
    tnm_stage_two_groups +
    # histology_type_imp + # Correlated with NSCLC
    mucinous_histology_imp +
    # grade_histological_differentiation_imp + # violates PH assumption
    # catheter_device_imp + # quite correlated with tumor type, as Cramer's V indicates
    # family_background_vte + # High GVIF, multicollinearity with venous insufficiency in some folds. When selected, not clear if it is risk or protective factor, due to lack of information when recruiting patients
    previous_vte +
    previous_ate +
    anticoag_tx_currently +
    antiaggreg_tx_currently, 
    data = training_fold,
    include = "primary_tumor_simplified",
    selection = "bidirection",
    select = "AIC", 
    method = "efron"
    )
  
  # Save selected variables
  selected_variables = unname(unlist(stepwise_model$`Selected Varaibles`))
  # print(selected_variables)
  
  # Fit model with selected variables in this fold
  # Create formula
  fold_pcox_formula = as.formula(
    sprintf('%s ~ %s', 'Surv(time = tstart, time2 = tstop, event = death)',
            paste(selected_variables, collapse = " + ")))
  
  # Fit model
  fold_cph_model = coxph(
    formula = fold_pcox_formula, 
    id = id,
    data = validation_fold
  )
  
  # VIF
  vif_results = rms::vif(fold_cph_model)
  # print(vif_results)
  
  # Make predictions on the validation fold
  # The result is the hazard of suffering the event. It's difficult to interpret
  predictions <- predict(fold_cph_model, newdata = validation_fold)
  # print(predictions)
  
  # Evaluate the predictions
  # C-index
  fold_cindex <- concordance(fold_cph_model, newdata = validation_fold)
  # print(fold_cindex$concordance)
  
  # Store the evaluation result
  stepwise_cv_index_valid[[i]] = validation_ids
  stepwise_cv_vars_results[[i]] = selected_variables
  stepwise_cv_cindex_results[[i]] = fold_cindex$concordance
  stepwise_cv_vif_results[[i]] = as.data.frame(vif_results)

}
## Warning in agreg.fit(X, Y, istrat, offset, init, control, weights = weights, :
## Ran out of iterations and did not converge
## Warning in agreg.fit(X, Y, istrat, offset, init, control, weights = weights, :
## Loglik converged before variable 8 ; beta may be infinite.
## Warning in agreg.fit(X, Y, istrat, offset, init, control, weights = weights, :
## Ran out of iterations and did not converge
## Warning in agreg.fit(X, Y, istrat, offset, init, control, weights = weights, :
## Loglik converged before variable 6,8 ; beta may be infinite.
## Warning in agreg.fit(X, Y, istrat, offset, init, control, weights = weights, :
## Loglik converged before variable 7 ; beta may be infinite.
## Warning in agreg.fit(X, Y, istrat, offset, init, control, weights = weights, :
## Loglik converged before variable 5 ; beta may be infinite.
## Warning in agreg.fit(X, Y, istrat, offset, init, control, weights = weights, :
## Ran out of iterations and did not converge
# Append info to file for experiment's reproducibility
  info_df = data.frame(
    "Run" = run_datetime,
    "Seed" = seed,
    "Train_IDs" = paste(train_ids, collapse = ", "),
    "Test_IDs" = paste(test_ids, collapse = ", "), 
    "IDs_folds_shuffled" = paste(folds_shuffled_ids, collapse = ", "), 
    "index_cv_folds" = paste(cv_folds_index, collapse = ", "), 
    "Validation_IDs" = paste(stepwise_cv_index_valid, collapse = ", ")
  )

  # Append to CSV file
  append2file(data = info_df)
# Variable selection frequency across 10-fold CV
stepwise_var_selection_freq = as.data.frame(table(unlist(stepwise_cv_vars_results)))
stepwise_var_selection_freq %<>% arrange(desc(Freq))
knitr::kable(stepwise_var_selection_freq)
Var1 Freq
performance_status_category_corrected_imp 10
primary_tumor_simplified 10
tnm_stage_two_groups 10
tumor_surgically_removed 10
vte_event 10
mucinous_histology_imp 6
age_when_cancer_dx 1
previous_vte 1
# Mean C-index across 10-fold CV
mean(unlist(stepwise_cv_cindex_results))
## [1] 0.86057
# Final model with most frequently selected variables
stepwise_cv_cph_final_model = coxph(
  Surv(time = tstart, 
       time2 = tstop, 
       event = death) ~ 
    cluster(id) + 
    performance_status_category_corrected_imp +
    primary_tumor_simplified +
    tnm_stage_two_groups +
    tumor_surgically_removed +
    vte_event +
    mucinous_histology_imp, # after removing venous insufficiency
    # vte_event + 
    # performance_status_category_corrected_imp +
    # primary_tumor_simplified +
    # tnm_stage_two_groups +
    # tumor_surgically_removed +
    # venous_insufficiency_imp +
    # mucinous_histology_imp,
  id = id,
  data = stepwise_training_set
  )

# Check proportional hazards assumption for Cox model
cox.zph(stepwise_cv_cph_final_model)
##                                             chisq df    p
## performance_status_category_corrected_imp 2.53248  2 0.28
## primary_tumor_simplified                  3.48145  3 0.32
## tnm_stage_two_groups                      0.00028  1 0.99
## tumor_surgically_removed                  0.00873  1 0.93
## vte_event                                 0.10323  1 0.75
## mucinous_histology_imp                    0.81547  1 0.37
## GLOBAL                                    5.79076  9 0.76
# Show HR and 95% CI
stepwise_cv_cph_final_model %>% 
  gtsummary::tbl_regression(exp = TRUE)
Characteristic HR1 95% CI1 p-value
performance_status_category_corrected_imp


    0
    1 1.10 0.71, 1.72 0.7
    2 4.19 2.38, 7.37 <0.001
primary_tumor_simplified


    Colorectal
    NSCLC 1.91 1.13, 3.24 0.016
    Oesophago-gastric 1.86 1.07, 3.23 0.028
    Pancreatic 3.09 1.81, 5.26 <0.001
tnm_stage_two_groups


    I-II-III
    IV 2.74 1.81, 4.13 <0.001
tumor_surgically_removed


    No
    Yes 0.28 0.14, 0.56 <0.001
vte_event 2.08 1.39, 3.11 <0.001
mucinous_histology_imp


    No
    Yes 1.40 0.94, 2.10 0.10
1 HR = Hazard Ratio, CI = Confidence Interval
# Inspect VIF in final model
vif_results = car::vif(stepwise_cv_cph_final_model)
## Warning in vif.default(stepwise_cv_cph_final_model): No intercept: vifs may not
## be sensible.
vif_results
##                                               GVIF Df GVIF^(1/(2*Df))
## performance_status_category_corrected_imp 1.260776  2        1.059643
## primary_tumor_simplified                  1.407597  3        1.058635
## tnm_stage_two_groups                      1.185783  1        1.088937
## tumor_surgically_removed                  1.303690  1        1.141793
## vte_event                                 1.111697  1        1.054370
## mucinous_histology_imp                    1.121929  1        1.059211
# C-index in testing set
concordance(stepwise_cv_cph_final_model, newdata = stepwise_testing_set)
## Call:
## concordance.coxph(object = stepwise_cv_cph_final_model, newdata = stepwise_testing_set)
## 
## n= 155 
## Concordance= 0.8973 se= 0.03567
## concordant discordant     tied.x     tied.y    tied.xy 
##       1038        110         20          0          0
# Akaike information criterion (AIC) value
AIC(stepwise_cv_cph_final_model)
## [1] 1276.862